Days between publication date and appearing on best seller list (Filtered looking just at Fiction) - using Alexis’ data

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)

library(ggplot2)
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
# Load the raw NYT bestseller export. The file name contains a space before
# the extension; it is reproduced exactly as written.
nyt_bestseller <- read.csv("NYT_best_seller .csv", stringsAsFactors = FALSE)

# Keep only fiction lists. A case-insensitive search for "fiction" alone also
# matches list names such as "Nonfiction", so those are excluded explicitly.
fiction_df <- subset(
  nyt_bestseller,
  grepl("fiction", list_name, ignore.case = TRUE) &
    !grepl("non[- ]?fiction", list_name, ignore.case = TRUE)
)

# Parse both MM/DD/YY date columns into Date values
fiction_df$bestsellers_date <- as.Date(fiction_df$bestsellers_date, format = "%m/%d/%y")
fiction_df$published_date <- as.Date(fiction_df$published_date, format = "%m/%d/%y")

# Keep rows whose published_date falls in 2010-2016 (inclusive). Comparing
# Dates directly avoids comparing the character output of format() with a
# number. Rows whose date failed to parse (NA) are dropped by subset().
fiction_df <- subset(
  fiction_df,
  published_date >= as.Date("2010-01-01") &
    published_date <= as.Date("2016-12-31")
)

# Days elapsed from published_date to bestsellers_date (negative when the
# bestsellers_date is the earlier of the two).
# NOTE(review): the all-categories output printed further down this document
# is -14 or -15 for every row, which suggests published_date may be the date
# the list was published rather than the book's release date -- confirm
# against the data dictionary before interpreting this as "days to bestseller".
fiction_df$days_to_best_seller <- as.numeric(
  difftime(fiction_df$bestsellers_date, fiction_df$published_date, units = "days")
)

# Scatter plot of the day gap against published_date with a LOESS trend line
ggplot(fiction_df, aes(x = published_date, y = days_to_best_seller)) +
  # Points coloured by the same value that is on the y-axis
  geom_point(aes(color = days_to_best_seller), alpha = 0.6, size = 2) +
  # `linewidth` replaces `size` for lines (deprecated since ggplot2 3.4.0)
  geom_smooth(method = "loess", se = TRUE, color = "red", linewidth = 1) +
  scale_color_gradient(low = "blue", high = "red") +
  # One labelled break per year
  scale_x_date(date_labels = "%Y", date_breaks = "1 year") +
  labs(title = "Days to Bestseller vs. Publication Date (2010-2016)",
       x = "Publication Date",
       y = "Days to Become Bestseller",
       color = "Days to Bestseller") +
  theme_minimal(base_size = 14) +
  # Tilt the year labels so they do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'

Includes all categories in Alexis’ data

# Load necessary libraries
library(ggplot2)
library(scales)  # For formatting dates



# Parse both MM/DD/YY date columns of the full dataset into Date values
nyt_bestseller$bestsellers_date <- as.Date(nyt_bestseller$bestsellers_date, format = "%m/%d/%y")
nyt_bestseller$published_date <- as.Date(nyt_bestseller$published_date, format = "%m/%d/%y")

# Keep rows whose published_date falls in 2010-2016 (inclusive). Comparing
# Dates directly avoids comparing the character output of format() with a
# number. Rows whose date failed to parse (NA) are dropped by subset().
# Note that this overwrites nyt_bestseller with the filtered rows.
nyt_bestseller <- subset(
  nyt_bestseller,
  published_date >= as.Date("2010-01-01") &
    published_date <= as.Date("2016-12-31")
)

# Days elapsed from published_date to bestsellers_date.
# NOTE(review): the values printed below this chunk are -14 or -15 for every
# row, i.e. bestsellers_date is always about two weeks BEFORE published_date.
# That suggests published_date may be the date the list was published rather
# than the book's release date -- confirm before reading this as "days to
# become a bestseller".
nyt_bestseller$days_to_best_seller <- as.numeric(
  difftime(nyt_bestseller$bestsellers_date, nyt_bestseller$published_date, units = "days")
)

# Scatter plot of the day gap against published_date with a LOESS trend line
ggplot(nyt_bestseller, aes(x = published_date, y = days_to_best_seller)) +
  # Points coloured by the same value that is on the y-axis
  geom_point(aes(color = days_to_best_seller), alpha = 0.6, size = 2) +
  # `linewidth` replaces `size` for lines (deprecated since ggplot2 3.4.0)
  geom_smooth(method = "loess", se = TRUE, color = "red", linewidth = 1) +
  scale_color_gradient(low = "blue", high = "red") +
  # One labelled break per year
  scale_x_date(date_labels = "%Y", date_breaks = "1 year") +
  labs(title = "Days to Bestseller vs. Publication Date (2010-2016)",
       x = "Publication Date",
       y = "Days to Become Bestseller",
       color = "Days to Bestseller") +
  theme_minimal(base_size = 14) +
  # Tilt the year labels so they do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## `geom_smooth()` using formula = 'y ~ x'

library(dplyr)
# Dump the full day-gap vector to the console for inspection
print(nyt_bestseller[["days_to_best_seller"]])
##   [1] -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14
##  [19] -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14
##  [37] -14 -14 -14 -14 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
##  [55] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
##  [73] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
##  [91] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [109] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [127] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [145] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [163] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [181] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [199] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [217] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [235] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -14 -14
## [253] -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14
## [271] -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14
## [289] -14 -14 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [307] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [325] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [343] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [361] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [379] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [397] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [415] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [433] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [451] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [469] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [487] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -14 -14 -14 -14
## [505] -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14
## [523] -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14
## [541] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [559] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [577] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [595] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [613] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [631] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [649] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [667] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [685] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [703] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [721] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [739] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [757] -15 -15 -15 -15

Top 10 publishers - Using Alexis’ data (Filtered for 2010-2016)

# Filter for 2010-2016 

# Load required libraries
library(dplyr)
library(readr)

# Load the publisher dataset; show_col_types = FALSE silences readr's
# column-specification message
df <- read_csv("NYT_best_seller_08_16.csv", show_col_types = FALSE)

# Parse the MM/DD/YY strings in published_date into Date values
df[["published_date"]] <- as.Date(df[["published_date"]], format = "%m/%d/%y")

# Sanity check before filtering: earliest and latest parsed date, ignoring NAs
print(range(df[["published_date"]], na.rm = TRUE))
## [1] "2008-06-08" "2016-06-12"
# Restrict to rows with published_date from 2010-01-01 through 2016-12-31
# (both ends inclusive); separate filter() conditions are combined with AND
df_filtered <- df %>%
  filter(
    published_date >= as.Date("2010-01-01"),
    published_date <= as.Date("2016-12-31")
  )

# Tally rows per publisher (column `n`), largest tally first
publisher_counts_2010_2016 <- df_filtered %>%
  count(publisher, sort = TRUE)

# Confirm the tally is non-empty: rows x columns
print(dim(publisher_counts_2010_2016))
## [1] 115   2
# Preview the first six rows of the per-publisher tally
publisher_counts_2010_2016 %>%
  head() %>%
  print()
## # A tibble: 6 × 2
##   publisher         n
##   <chr>         <int>
## 1 Grand Central    63
## 2 Bantam           54
## 3 Berkley          40
## 4 Vintage          28
## 5 Ballantine       27
## 6 Putnam           26
# Display the per-publisher tally (tibble printing shows the first ten rows)
publisher_counts_2010_2016 %>%
  print()
## # A tibble: 115 × 2
##    publisher            n
##    <chr>            <int>
##  1 Grand Central       63
##  2 Bantam              54
##  3 Berkley             40
##  4 Vintage             28
##  5 Ballantine          27
##  6 Putnam              26
##  7 Dell                25
##  8 Little, Brown       24
##  9 Simon & Schuster    20
## 10 Knopf               19
## # ℹ 105 more rows

Visualization for Publisher counts in NYT Best Sellers - Alexis’ Data (2010-2016)

# Horizontal bar chart of every publisher's tally, smallest at the bottom.
# geom_col() draws bars whose height is the value of `n` itself.
ggplot(publisher_counts_2010_2016) +
  geom_col(aes(x = reorder(publisher, n), y = n), fill = "skyblue") +
  coord_flip() +
  labs(
    title = "Publisher Counts in NYT Best Sellers",
    subtitle = "Data from 2010-2016",
    x = "Publisher",
    y = "Number of Best Sellers"
  ) +
  theme_minimal()

Top 10 Publishers 2010-2016

# Sort by tally (largest first) and keep the first ten rows
top_publishers <- head(arrange(publisher_counts_2010_2016, desc(n)), 10)

# Horizontal bar chart of those ten publishers, largest bar at the top
ggplot(top_publishers) +
  geom_col(aes(x = reorder(publisher, n), y = n), fill = "red") +
  coord_flip() +
  labs(
    title = "Top 10 Publishers in NYT Best Sellers",
    subtitle = "Data from 2010-2016",
    x = "Publisher",
    y = "Number of Best Sellers"
  ) +
  theme_minimal()

Month of publication for best sellers (not looking at categories), using Lucas’ data. UNCLEAN

# Load necessary libraries
library(ggplot2)
library(dplyr)
library(lubridate)

# Load the dataset
data <- read.csv("Updated_Bestsellers_Data_Cleaned.csv", stringsAsFactors = FALSE)

# Parse the MM/DD/YY strings in published_date into Date values
data$published_date <- as.Date(data$published_date, format = "%m/%d/%y")

# Derive the month as an ordered factor, January..December. The name is looked
# up from the numeric month ("%m") rather than taken from format(..., "%B"),
# because "%B" returns the month name in the session's locale and would not
# match the English `month.name` levels (yielding NA) under a non-English
# LC_TIME. Dates that failed to parse stay NA.
data$month <- factor(
  month.name[as.integer(format(data$published_date, "%m"))],
  levels = month.name,
  ordered = TRUE
)

# Tally rows per month of published_date
monthly_counts <- data %>%
  group_by(month) %>%
  summarise(count = n())

# Bar chart with one bar per month; the fill duplicates the x-axis, so the
# legend is suppressed
ggplot(monthly_counts, aes(x = month, y = count, fill = month)) +
  geom_col(show.legend = FALSE) +
  labs(title = "Frequency of Bestseller Books Released Per Month",
       x = "Month",
       y = "Number of Bestsellers") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Stacked bar. UNCLEAN

# Read the categorised bestseller file with read.csv()'s default settings.
# NOTE(review): `updated_categories` is not referenced again in the visible
# part of this document -- confirm whether it is still needed.
updated_categories <- read.csv(file = "Updated_Bestsellers_Data_Cleaned.csv")


library(ggplot2)
library(dplyr)
library(lubridate)

# Load the updated dataset
data <- read.csv("Updated_Bestsellers_Data_Cleaned.csv", stringsAsFactors = FALSE)

# Parse the MM/DD/YY strings in published_date into Date values
data$published_date <- as.Date(data$published_date, format = "%m/%d/%y")

# Month as a factor in calendar order. The name is looked up from the numeric
# month ("%m") instead of format(..., "%B"), whose output depends on the
# session locale and would not match the English `month.name` levels under a
# non-English LC_TIME.
data$month <- factor(
  month.name[as.integer(format(data$published_date, "%m"))],
  levels = month.name
)

# Stacked bar chart: geom_bar() counts the rows in each month and stacks them
# by New_Category
ggplot(data, aes(x = month, fill = New_Category)) +
  geom_bar() +
  theme_minimal() +
  labs(title = "Distribution of Bestsellers by Month",
       x = "Month",
       y = "Count of Bestsellers",
       fill = "Category") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Heat map. UNCLEANED DATA

library(ggplot2)
library(dplyr)
library(lubridate)

# Load the dataset
data <- read.csv("Updated_Bestsellers_Data_Cleaned.csv", stringsAsFactors = FALSE)

# Parse the MM/DD/YY strings in published_date into Date values
data$published_date <- as.Date(data$published_date, format = "%m/%d/%y")

# Month as a factor in calendar order, derived from the numeric month ("%m").
# format(..., "%B") is avoided because it is locale-dependent and would not
# match the English `month.name` levels under a non-English LC_TIME.
data$month <- factor(
  month.name[as.integer(format(data$published_date, "%m"))],
  levels = month.name
)

# Tally rows for each month/category pair (column `n`)
heatmap_data <- data %>%
  count(month, New_Category)

# Heatmap: one tile per month/category pair, darker blue for larger tallies.
# Pairs with no rows are absent from heatmap_data and are left blank.
ggplot(heatmap_data, aes(x = month, y = New_Category, fill = n)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  theme_minimal() +
  labs(title = "Heatmap of Bestsellers by Month and Category",
       x = "Month",
       y = "Category",
       fill = "Count") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Stacked area chart. UNCLEANED DATA

library(ggplot2)
library(dplyr)
library(lubridate)

# Load the dataset
data <- read.csv("Updated_Bestsellers_Data_Cleaned.csv", stringsAsFactors = FALSE)

# Parse the MM/DD/YY strings in published_date into Date values
data$published_date <- as.Date(data$published_date, format = "%m/%d/%y")

# Month as a factor in calendar order, derived from the numeric month ("%m").
# format(..., "%B") is avoided because it is locale-dependent and would not
# match the English `month.name` levels under a non-English LC_TIME.
data$month <- factor(
  month.name[as.integer(format(data$published_date, "%m"))],
  levels = month.name
)

# Tally rows for each month/category pair, then add an explicit zero row for
# every pair that has no data. Without the zeros a category that is missing in
# some month has no point there, and the stacked area is drawn across the gap
# instead of dropping to zero.
area_chart_data <- data %>%
  count(month, New_Category) %>%
  tidyr::complete(month, New_Category, fill = list(n = 0L))

# Stacked area chart; `group` is required so each category forms one
# continuous area across the discrete month axis
ggplot(area_chart_data, aes(x = month, y = n, fill = New_Category, group = New_Category)) +
  geom_area(position = "stack", alpha = 0.7) +
  theme_minimal() +
  labs(title = "Stacked Area Chart of Bestsellers by Month",
       x = "Month",
       y = "Count of Bestsellers",
       fill = "Category") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

USING COMPLETELY CLEAN DATA TO VISUALIZE MONTHS OF BEST SELLERS

# Per month
data <- read.csv("dataclean_updated.csv", stringsAsFactors = FALSE)

# Parse the MM/DD/YY strings in published_date into Date values
data$published_date <- as.Date(data$published_date, format = "%m/%d/%y")

# Derive the month as an ordered factor, January..December. The name is looked
# up from the numeric month ("%m") rather than taken from format(..., "%B"),
# because "%B" returns the month name in the session's locale and would not
# match the English `month.name` levels (yielding NA) under a non-English
# LC_TIME. Dates that failed to parse stay NA.
data$month <- factor(
  month.name[as.integer(format(data$published_date, "%m"))],
  levels = month.name,
  ordered = TRUE
)

# Tally rows per month of published_date
monthly_counts <- data %>%
  group_by(month) %>%
  summarise(count = n())

# Bar chart with one bar per month; the fill duplicates the x-axis, so the
# legend is suppressed
ggplot(monthly_counts, aes(x = month, y = count, fill = month)) +
  geom_col(show.legend = FALSE) +
  labs(title = "Frequency of Bestseller Books Released Per Month",
       x = "Month",
       y = "Number of Bestsellers") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Stacked Area Chart of categories by month with clean data

# Load the cleaned dataset
data <- read.csv("dataclean_updated.csv", stringsAsFactors = FALSE)

# Parse the MM/DD/YY strings in published_date into Date values
data$published_date <- as.Date(data$published_date, format = "%m/%d/%y")

# Month as a factor in calendar order, derived from the numeric month ("%m").
# format(..., "%B") is avoided because it is locale-dependent and would not
# match the English `month.name` levels under a non-English LC_TIME.
data$month <- factor(
  month.name[as.integer(format(data$published_date, "%m"))],
  levels = month.name
)

# Tally rows for each month/category pair, then add an explicit zero row for
# every pair that has no data. Without the zeros a category that is missing in
# some month has no point there, and the stacked area is drawn across the gap
# instead of dropping to zero.
area_chart_data <- data %>%
  count(month, New_Category) %>%
  tidyr::complete(month, New_Category, fill = list(n = 0L))

# Stacked area chart; `group` is required so each category forms one
# continuous area across the discrete month axis
ggplot(area_chart_data, aes(x = month, y = n, fill = New_Category, group = New_Category)) +
  geom_area(position = "stack", alpha = 0.7) +
  theme_minimal() +
  labs(title = "Stacked Area Chart of Bestsellers by Month",
       x = "Month",
       y = "Count of Bestsellers",
       fill = "Category") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Heat map of Bestsellers by month and category with clean data

# Load the cleaned dataset
data <- read.csv("dataclean_updated.csv", stringsAsFactors = FALSE)

# Parse the MM/DD/YY strings in published_date into Date values
data$published_date <- as.Date(data$published_date, format = "%m/%d/%y")

# Month as a factor in calendar order, derived from the numeric month ("%m").
# format(..., "%B") is avoided because it is locale-dependent and would not
# match the English `month.name` levels under a non-English LC_TIME.
data$month <- factor(
  month.name[as.integer(format(data$published_date, "%m"))],
  levels = month.name
)

# Tally rows for each month/category pair (column `n`)
heatmap_data <- data %>%
  count(month, New_Category)

# Heatmap: one tile per month/category pair, darker blue for larger tallies.
# Pairs with no rows are absent from heatmap_data and are left blank.
ggplot(heatmap_data, aes(x = month, y = New_Category, fill = n)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  theme_minimal() +
  labs(title = "Heatmap of Bestsellers by Month and Category",
       x = "Month",
       y = "Category",
       fill = "Count") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Distribution of bestsellers by month and category with clean data

# Load the cleaned dataset
data <- read.csv("dataclean_updated.csv", stringsAsFactors = FALSE)

# Parse the MM/DD/YY strings in published_date into Date values
data$published_date <- as.Date(data$published_date, format = "%m/%d/%y")

# Month as a factor in calendar order. The name is looked up from the numeric
# month ("%m") instead of format(..., "%B"), whose output depends on the
# session locale and would not match the English `month.name` levels under a
# non-English LC_TIME.
data$month <- factor(
  month.name[as.integer(format(data$published_date, "%m"))],
  levels = month.name
)

# Stacked bar chart: geom_bar() counts the rows in each month and stacks them
# by New_Category
ggplot(data, aes(x = month, fill = New_Category)) +
  geom_bar() +
  theme_minimal() +
  labs(title = "Distribution of Bestsellers by Month",
       x = "Month",
       y = "Count of Bestsellers",
       fill = "Category") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Count for each author on best seller list - Using Lucas’ Data (CLEANED)

# Load the cleaned dataset
nyt_authors <- read.csv("dataclean_updated.csv", stringsAsFactors = FALSE)

# Tally rows per author (column `count`), most frequent first. Converting to a
# tibble first keeps the result a tibble, so it prints in tibble form.
author_counts <- nyt_authors %>%
  as_tibble() %>%
  count(author, name = "count", sort = TRUE)

print(author_counts)
## # A tibble: 3,045 × 2
##    author                            count
##    <chr>                             <int>
##  1 Danielle Steel                      113
##  2 David Baldacci                      100
##  3 Nora Roberts                         80
##  4 John Grisham                         74
##  5 James Patterson                      68
##  6 Debbie Macomber                      67
##  7 James Patterson and Maxine Paetro    63
##  8 John Sandford                        58
##  9 Stuart Woods                         57
## 10 Janet Evanovich                      55
## # ℹ 3,035 more rows

Authors and their respective categories

# Load necessary libraries
library(dplyr)
library(tibble)

# Load the cleaned dataset
nyt_authors <- read.csv("dataclean_updated.csv", stringsAsFactors = FALSE)

# Tally rows for each author/category pair (column `count`), largest first.
# Note: this replaces the per-author `author_counts` with a per-author-and-
# category table.
author_counts <- nyt_authors %>%
  as_tibble() %>%
  count(author, New_Category, name = "count", sort = TRUE)

# Keep a tibble copy under its own name for display
author_tibble <- as_tibble(author_counts)

# Show the table (tibble printing shows the first ten rows)
print(author_tibble)
## # A tibble: 3,351 × 3
##    author                            New_Category count
##    <chr>                             <chr>        <int>
##  1 Danielle Steel                    Fiction         90
##  2 David Baldacci                    Fiction         81
##  3 Nora Roberts                      Fiction         71
##  4 John Grisham                      Fiction         59
##  5 Stuart Woods                      Fiction         57
##  6 James Patterson                   Fiction         52
##  7 James Patterson and Maxine Paetro Fiction         51
##  8 John Sandford                     Fiction         47
##  9 Stephen King                      Fiction         47
## 10 Lee Child                         Fiction         43
## # ℹ 3,341 more rows

Visualization of top 20 authors overall

# Top 20 authors by TOTAL appearances.
# At this point `author_counts` holds one row per author/category pair, so its
# `count` column understates authors who appear in more than one category
# (e.g. Danielle Steel: 90 in Fiction vs 113 overall in the output above).
# Summing per author first gives the overall totals; if `author_counts` is
# already one row per author, the sum leaves it unchanged.
# slice_max() keeps ties at the cut-off, as the superseded top_n() did.
top_authors <- author_counts %>%
  group_by(author) %>%
  summarise(count = sum(count), .groups = "drop") %>%
  slice_max(order_by = count, n = 20)

# Horizontal bar chart, largest total at the top, bars shaded blue -> red by
# total
ggplot(top_authors, aes(x = reorder(author, count), y = count, fill = count)) +
  geom_col() +
  coord_flip() +
  scale_fill_gradient(low = "blue", high = "red") +
  labs(title = "Top 20 Authors on NYT Bestseller List",
       x = "Author",
       y = "Number of Appearances") +
  theme_minimal(base_size = 14)

Top 10 Authors in Each Category (not excluding if multiple authors have same count)

# Load necessary libraries
library(ggplot2)
library(dplyr)

# Load the cleaned dataset
nyt_authors <- read.csv("dataclean_updated.csv", stringsAsFactors = FALSE)

# Tally rows for each category/author pair, sorted by category and then by
# descending tally
author_counts <- nyt_authors %>%
  group_by(New_Category, author) %>%
  summarise(count = n(), .groups = "drop") %>%
  arrange(New_Category, desc(count))

# Top 10 authors per category. slice_max() keeps ties by default, so a
# category can contribute more than 10 rows when authors tie at the cut-off.
# ungroup() so the grouping does not leak into later steps.
top_authors <- author_counts %>%
  group_by(New_Category) %>%
  slice_max(order_by = count, n = 10) %>%
  ungroup()

# Faceted horizontal bar chart.
# Bars are ordered on an author+category key rather than on `author` alone: a
# factor built from `author` has one global order, so an author who appears in
# several categories would be misplaced within individual facets. The
# "___<category>" suffix is stripped again for the axis labels.
ggplot(
  top_authors,
  aes(
    x = reorder(paste(author, New_Category, sep = "___"), count),
    y = count,
    fill = count
  )
) +
  geom_col() +
  coord_flip() +  # Flip for readability
  scale_x_discrete(labels = function(key) sub("___.*$", "", key)) +
  scale_fill_gradient(low = "blue", high = "red") +  # Color gradient
  facet_wrap(~ New_Category, scales = "free_y") +  # Separate plots per category
  labs(title = "Top 10 Authors in Each Category",
       x = "Author",
       y = "Number of Appearances",
       fill = "Count") +
  theme_minimal(base_size = 12) +
  theme(axis.text.y = element_text(size = 8))

Using with_ties = FALSE to ensure exactly 10 authors per category

# Load necessary libraries
library(ggplot2)
library(dplyr)
library(forcats) 

# Load the cleaned dataset
nyt_authors <- read.csv("dataclean_updated.csv", stringsAsFactors = FALSE)

# Tally rows for each category/author pair, sorted by category and then by
# descending tally
author_counts <- nyt_authors %>%
  group_by(New_Category, author) %>%
  summarise(count = n(), .groups = "drop") %>%
  arrange(New_Category, desc(count))

# Top 10 authors per category; with_ties = FALSE caps every category at 10
# rows even when authors tie at the cut-off
top_authors <- author_counts %>%
  group_by(New_Category) %>%
  slice_max(order_by = count, n = 10, with_ties = FALSE) %>%
  ungroup()  # Remove grouping for plotting

# Faceted horizontal bar chart, ordered within each facet.
# fct_reorder() on `author` alone yields one global order, so an author who
# appears in several categories would be misplaced within individual facets.
# Ordering on an author+category key makes every bar's position depend only
# on its own count; the "___<category>" suffix is stripped for the labels.
ggplot(
  top_authors,
  aes(
    x = forcats::fct_reorder(
      paste(author, New_Category, sep = "___"), count, .desc = TRUE
    ),
    y = count,
    fill = count
  )
) +
  geom_col() +
  coord_flip() +  # Flip for readability
  scale_x_discrete(labels = function(key) sub("___.*$", "", key)) +
  scale_fill_gradient(low = "blue", high = "red") +  # Color gradient
  facet_wrap(~ New_Category, scales = "free_y") +  # Separate plots per category
  labs(title = "Top 10 Authors in Each Category",
       x = "Author",
       y = "Number of Appearances",
       fill = "Count") +
  theme_minimal(base_size = 12) +
  theme(axis.text.y = element_text(size = 8))